# Package setup ----
# tidymodels supplies the modeling framework (parsnip, recipes, tune,
# yardstick); dplyr/stringr/forcats/lubridate handle wrangling;
# splitstackshape::cSplit splits the comma-separated Genre column;
# rpart.plot/cluster/vip/probably support trees, clustering, variable
# importance and threshold analysis.
library(ISLR)
library(dplyr)
library(readr)
library(broom)
library(ggplot2)
library(tidymodels)
library(stringr)
library(splitstackshape)
library(lubridate)
library(rpart.plot)
library(cluster)
library(forcats)
tidymodels_prefer()
library(probably) #install.packages('probably')
library(vip)
# Load the cleaned IMDB top-1000 data set.
imdb_top_1000 <- read_csv("~/Desktop/Statistical Machine Learning/R Files/Final Project/imdb_top_1000_CLEAN.csv")

# Split the comma-separated Genre column into Genre_1, Genre_2, ... and
# define the outcome Gross as log profit (log of Revenue minus Budget).
imdb_clean <- imdb_top_1000 %>%
  cSplit("Genre", sep = ",", direction = "wide") %>%
  mutate(Gross = log(Revenue - Budget))

# Strip the " min" suffix from Runtime so it can be used as a numeric predictor.
runtime_clean <- imdb_top_1000$Runtime %>%
  str_replace(" min", "") %>%
  as.numeric()
imdb_clean$Runtime <- runtime_clean

# Keep only rows with a finite outcome and a non-missing Budget.
# log(Revenue - Budget) yields -Inf when Revenue == Budget and NaN when
# Revenue < Budget; is.finite() drops both (plus NA) in one step. The
# original code compared the numeric Gross against the string "-Inf",
# which only worked through implicit numeric-to-character coercion.
imdb_clean <- imdb_clean %>%
  filter(is.finite(Gross)) %>%
  drop_na(Gross, Budget)
head(imdb_clean)
# Create 10-fold CV splits. The original code did not set a seed here, so
# the folds (and every downstream CV metric) changed on each run; seed the
# RNG so the analysis is reproducible.
set.seed(123)
data_cv10 <- vfold_cv(imdb_clean, v = 10)

# Model Spec: ordinary least squares regression.
lm_spec <-
  linear_reg() %>%
  set_engine(engine = 'lm') %>%
  set_mode('regression')

# Recipe: drop near-zero-variance predictors, standardize numeric
# predictors, dummy-code Genre_1, and omit rows with missing model variables.
full_lm_rec <- recipe(Gross ~ Runtime + IMDB_Rating + Meta_score +
                        No_of_Votes + Genre_1, data = imdb_clean) %>%
  step_nzv(all_predictors()) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_naomit(Gross, Runtime, IMDB_Rating, Meta_score, No_of_Votes)

# Workflow: recipe + model.
full_lm_wf <- workflow() %>%
  add_recipe(full_lm_rec) %>%
  add_model(lm_spec)

# Fit on the full data and inspect the coefficients.
full_lm_model <- fit(full_lm_wf, data = imdb_clean)
full_lm_model %>% tidy()

# Cross-validated performance of the OLS model.
full_lm_modelcv <- fit_resamples(full_lm_wf, resamples = data_cv10, metrics = metric_set(rmse, rsq, mae))
full_lm_modelcv %>%
  collect_metrics()
# LASSO specification: mixture = 1 selects the pure L1 penalty; the amount
# of regularization (penalty / lambda) is left as a tuning parameter.
lm_lasso_spec_tune <-
  linear_reg() %>%
  set_mode('regression') %>%
  set_engine(engine = 'glmnet') %>%
  set_args(mixture = 1, penalty = tune())

# Preprocessing recipe for the lasso.
data_rec_lasso <- recipe(Gross ~ Runtime + IMDB_Rating + Meta_score +
                           No_of_Votes + Genre_1, data = imdb_clean) %>%
  step_nzv(all_predictors()) %>%                # drop near-zero-variance predictors
  step_novel(all_nominal_predictors()) %>%      # guard against unseen factor levels
  step_normalize(all_numeric_predictors()) %>%  # standardization matters for LASSO
  step_dummy(all_nominal_predictors()) %>%      # indicator-code categorical predictors
  step_naomit(Gross, Runtime, IMDB_Rating,      # drop rows with missing model variables
              Meta_score, No_of_Votes)

# Workflow: recipe + tunable lasso spec.
lasso_wf_tune <- workflow() %>%
  add_recipe(data_rec_lasso) %>%
  add_model(lm_lasso_spec_tune)

# 30 candidate penalty values, log10-spaced from 10^-3 to 10^1.
penalty_grid <- grid_regular(penalty(range = c(-3, 1)), levels = 30)

# Cross-validated tuning over the penalty grid.
tune_res <- tune_grid(
  lasso_wf_tune,
  resamples = data_cv10,
  grid = penalty_grid,
  metrics = metric_set(rmse, mae)
)
# Visualize CV metrics across the penalty grid.
autoplot(tune_res) + theme_classic()

# Summarize CV metrics: one RMSE per candidate penalty.
lasso_mod <- tune_res %>%
  collect_metrics() %>%
  filter(.metric == 'rmse') %>%
  select(penalty, rmse = mean)

# Penalty with the lowest cross-validated RMSE.
best_penalty <- select_best(tune_res, metric = 'rmse')
lasso_mod

# Finalize the workflow with the chosen penalty and refit on the full data;
# also re-estimate CV performance at that penalty.
final_wf <- finalize_workflow(lasso_wf_tune, best_penalty)
final_fit <- fit(final_wf, data = imdb_clean)
lasso_fit <- fit_resamples(final_wf, resamples = data_cv10, metrics = metric_set(rmse, rsq, mae))
tidy(final_fit)

# Final ("best") model: predictors whose coefficients were not shrunk to zero.
final_fit %>% tidy() %>% filter(estimate != 0)
lasso_fit %>%
  collect_metrics()
# Residuals of the final LASSO fit on the full data.
lasso_mod_out <- final_fit %>%
  predict(new_data = imdb_clean) %>%
  bind_cols(imdb_clean) %>%
  mutate(resid = Gross - .pred)

# Helper: residuals plotted against a chosen x variable, with a smoother
# and a zero reference line to reveal any systematic bias.
lasso_resid_plot <- function(data, xvar) {
  ggplot(data, aes(x = {{ xvar }}, y = resid)) +
    geom_point() +
    geom_smooth() +
    geom_hline(yintercept = 0, color = "red") +
    theme_classic()
}

lasso_resid_plot(lasso_mod_out, .pred)
lasso_resid_plot(lasso_mod_out, Runtime)
lasso_resid_plot(lasso_mod_out, IMDB_Rating)
lasso_resid_plot(lasso_mod_out, No_of_Votes)
# Build the GAM
# Generalized additive model via mgcv; smooth terms chosen per predictor.
gam_spec <-
gen_additive_mod() %>%
set_engine(engine = 'mgcv') %>%
set_mode('regression')
# Fit: smooths on Runtime (basis dimension raised to k = 20), IMDB_Rating
# and No_of_Votes; Meta_score and Genre_1 enter linearly/parametrically.
gam_mod1 <- fit(gam_spec,
Gross ~ s(Runtime, k = 20) + s(IMDB_Rating) + Meta_score + s(No_of_Votes) + Genre_1,
data = imdb_clean
)
# Diagnostics: Check to see if the number of knots is large enough (if p-value is low, increase number of knots)
# pluck('fit') extracts the underlying mgcv gam object from the parsnip fit.
gam_mod1 %>% pluck('fit') %>% mgcv::gam.check()
##
## Method: GCV Optimizer: magic
## Smoothing parameter selection converged after 9 iterations.
## The RMS GCV score gradient at convergence was 3.345103e-06 .
## The Hessian was positive definite.
## Model rank = 51 / 51
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(Runtime) 19.00 14.98 0.97 0.26
## s(IMDB_Rating) 9.00 4.25 0.94 0.07 .
## s(No_of_Votes) 9.00 3.99 1.06 0.91
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Model summary: parametric coefficients plus approximate significance of
# the smooth terms.
gam_mod1 %>% pluck('fit') %>% summary()
##
## Family: gaussian
## Link function: identity
##
## Formula:
## Gross ~ s(Runtime, k = 20) + s(IMDB_Rating) + Meta_score + s(No_of_Votes) +
## Genre_1
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 16.832784 0.429340 39.206 < 2e-16 ***
## Meta_score 0.010241 0.005299 1.932 0.05387 .
## Genre_1Adventure -0.044096 0.247036 -0.179 0.85840
## Genre_1Animation 1.346910 0.301529 4.467 9.81e-06 ***
## Genre_1Biography 0.223135 0.234887 0.950 0.34258
## Genre_1Comedy 0.267256 0.227633 1.174 0.24093
## Genre_1Crime -0.581492 0.232807 -2.498 0.01282 *
## Genre_1Drama -0.334477 0.190120 -1.759 0.07914 .
## Genre_1Family -0.175638 0.962267 -0.183 0.85524
## Genre_1Film-Noir -2.558791 0.976808 -2.620 0.00907 **
## Genre_1Horror 0.512855 0.502375 1.021 0.30781
## Genre_1Mystery -1.175425 0.571692 -2.056 0.04029 *
## Genre_1Thriller -0.520050 1.354114 -0.384 0.70110
## Genre_1Western -0.455014 0.817789 -0.556 0.57819
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(Runtime) 14.979 17.114 5.202 <2e-16 ***
## s(IMDB_Rating) 4.254 5.211 19.128 <2e-16 ***
## s(No_of_Votes) 3.989 4.958 68.466 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.521 Deviance explained = 55.4%
## GCV = 1.9035 Scale est. = 1.7723 n = 540
# Visualize: Look at the estimated non-linear functions
gam_mod1 %>% pluck('fit') %>% plot()
# In-sample predictions and residuals for the GAM.
gam1_output <- gam_mod1%>%
predict(new_data = imdb_clean) %>%
bind_cols(imdb_clean) %>%
mutate(resid = Gross - .pred)
# In-sample (training-data) RMSE and R-squared; note these are optimistic
# relative to the cross-validated metrics computed for the other models.
gam1_output %>%
rmse(truth = Gross, estimate = .pred)
gam1_output %>%
rsq(truth = Gross, estimate = .pred)
# Recipe version of the GAM: natural splines with degrees of freedom chosen
# to mirror the basis dimensions used in the mgcv fit above (k = 20 for
# Runtime, default k = 10 for the others).
spline_rec <- recipe(Gross ~ Runtime + IMDB_Rating + Meta_score +
                       No_of_Votes + Genre_1, data = imdb_clean) %>%
  step_nzv(all_predictors()) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_naomit(Gross) %>%
  step_ns(Runtime, deg_free = 19) %>%
  step_ns(No_of_Votes, deg_free = 9) %>%
  step_ns(IMDB_Rating, deg_free = 9)
spline_rec %>% prep(imdb_clean) %>% juice()

# Linear model spec paired with the spline recipe.
lm_spec_gam <-
  linear_reg() %>%
  set_engine(engine = 'lm') %>%
  set_mode('regression')

# Workflow. The original code added `lm_spec` here, leaving `lm_spec_gam`
# defined but unused; use the spec created for this model. The two specs
# are identical, so the fitted results are unchanged.
spline_wf <- workflow() %>%
  add_model(lm_spec_gam) %>%
  add_recipe(spline_rec)

# Cross-validated performance of the spline workflow.
cv_output_spline2 <- fit_resamples(
  spline_wf,
  resamples = data_cv10,
  metrics = metric_set(mae, rmse, rsq)
)
cv_output_spline2 %>% collect_metrics()

# Compare the mgcv GAM's in-sample RMSE with the spline workflow's CV metrics.
gam1_output %>%
  rmse(truth = Gross, estimate = .pred)
cv_output_spline2 %>% collect_metrics()
# The GAM fit directly with mgcv performs better than the natural-spline
# recipe approach; this likely reflects the degrees of freedom chosen for
# the splines (mgcv selects effective degrees of freedom automatically).
# Visualize Residuals
# The original code predicted from `new_mod`, an object that was never
# created (the run recorded "object 'new_mod' not found"). Fit the spline
# workflow first and use that fit for the residual analysis.
spline_mod <- fit(spline_wf, data = imdb_clean)
gam1_output <- spline_mod %>%
  predict(new_data = imdb_clean) %>%
  bind_cols(imdb_clean) %>%
  mutate(resid = Gross - .pred)

# Residuals vs fitted values and vs each numeric predictor: the smoother
# should hug the zero line if the model is unbiased.
ggplot(gam1_output, aes(x = .pred, y = resid)) +
  geom_point() +
  geom_smooth() +
  geom_hline(yintercept = 0, color = "red") +
  theme_classic()
ggplot(gam1_output, aes(x = Runtime, y = resid)) +
  geom_point() +
  geom_smooth() +
  geom_hline(yintercept = 0, color = "red") +
  theme_classic()
ggplot(gam1_output, aes(x = IMDB_Rating, y = resid)) +
  geom_point() +
  geom_smooth() +
  geom_hline(yintercept = 0, color = "red") +
  theme_classic()
ggplot(gam1_output, aes(x = No_of_Votes, y = resid)) +
  geom_point() +
  geom_smooth() +
  geom_hline(yintercept = 0, color = "red") +
  theme_classic()
# There does not appear to be any significant bias after examining the
# residual plots above.
# Side-by-side comparison of the three regression approaches.
full_lm_modelcv %>% collect_metrics() # OLS (CV)
lasso_fit %>% collect_metrics()       # LASSO (CV)
# GAM (in-sample RMSE from the spline-workflow predictions)
rmse(gam1_output, truth = Gross, estimate = .pred)
# The spline-based GAM built with tidymodels performs best, with the lowest
# RMSE (about 1.3, compared to roughly 1.4 for the other models).
# Classification data: a movie counts as a "flop" (TRUE) unless its revenue
# is more than twice its budget.
imdb_clean_class <- imdb_clean %>%
  mutate(success_ratio = Revenue / Budget) %>%
  mutate(flop = as.factor(success_ratio <= 2)) %>%
  drop_na(flop, No_of_Votes, Runtime, IMDB_Rating, Meta_score, Genre_1)

# Model Specification: ranger random forest, 1000 trees, hard class
# predictions (probability = FALSE), impurity-based importance.
rf_spec <- rand_forest() %>%
  set_mode('classification') %>%
  set_engine(engine = 'ranger') %>%
  set_args(trees = 1000,
           min_n = NULL,
           probability = FALSE,
           importance = 'impurity')

# Recipe: no preprocessing beyond dropping incomplete rows.
data_rec <- recipe(flop ~ No_of_Votes + Runtime + IMDB_Rating + Meta_score + Genre_1,
                   data = imdb_clean_class) %>%
  step_naomit(flop, No_of_Votes, Runtime, IMDB_Rating, Meta_score, Genre_1)

# One workflow per candidate mtry (number of variables tried at each split).
data_wf_mtry3 <- workflow() %>%
  add_recipe(data_rec) %>%
  add_model(rf_spec %>% set_args(mtry = 3))
data_wf_mtry4 <- workflow() %>%
  add_recipe(data_rec) %>%
  add_model(rf_spec %>% set_args(mtry = 4))
data_wf_mtry5 <- workflow() %>%
  add_recipe(data_rec) %>%
  add_model(rf_spec %>% set_args(mtry = 5))

# Fit the forests -- reset the seed before each fit so all three are grown
# from the same 1000 bootstrap samples.
set.seed(123)
data_fit_mtry3 <- fit(data_wf_mtry3, data = imdb_clean_class)
set.seed(123)
data_fit_mtry4 <- fit(data_wf_mtry4, data = imdb_clean_class)
set.seed(123)
data_fit_mtry5 <- fit(data_wf_mtry5, data = imdb_clean_class)
# Collect a fitted random forest's out-of-bag class predictions next to the
# true outcomes, tagging each row with a caller-supplied model label so
# results from several forests can be stacked and compared.
rf_OOB_output <- function(fit_model, model_label, truth){
  oob_preds <- fit_model %>%
    extract_fit_engine() %>%
    pluck('predictions') # ranger stores OOB predictions on the engine fit
  tibble(
    .pred_class = oob_preds,
    flop = truth,
    model = model_label
  )
}
# Evaluate OOB Metrics: stack the OOB predictions from the three forests.
data_rf_OOB_output <- bind_rows(
  rf_OOB_output(data_fit_mtry3, 3, imdb_clean_class %>% pull(flop)),
  rf_OOB_output(data_fit_mtry4, 4, imdb_clean_class %>% pull(flop)),
  rf_OOB_output(data_fit_mtry5, 5, imdb_clean_class %>% pull(flop))
)

# OOB accuracy for each mtry value.
data_rf_OOB_output %>%
  group_by(model) %>%
  accuracy(truth = flop, estimate = .pred_class)

# Plot OOB accuracy against mtry (labels here are already the numeric mtry
# values, so str_replace is effectively a pass-through before as.numeric).
data_rf_OOB_output %>%
  group_by(model) %>%
  accuracy(truth = flop, estimate = .pred_class) %>%
  mutate(mtry = as.numeric(stringr::str_replace(model, 'mtry', ''))) %>%
  ggplot(aes(x = mtry, y = .estimate)) +
  geom_point() +
  geom_line() +
  theme_classic()

# OOB confusion matrix for the mtry = 4 forest.
rf_OOB_output(data_fit_mtry4, 4, imdb_clean_class %>% pull(flop)) %>%
  conf_mat(truth = flop, estimate = .pred_class)
## Truth
## Prediction FALSE TRUE
## FALSE 468 65
## TRUE 7 0
# Impurity-based variable importance for the mtry = 4 forest.
model_output <- data_fit_mtry4 %>%
  extract_fit_engine()
model_output %>%
  vip(num_features = 10) + theme_classic() # top 10 predictors by impurity
model_output %>% vip::vi() %>% head()
model_output %>% vip::vi() %>% tail()

# Permutation-based importance: refit the same workflow with the
# importance argument switched to "permutation".
model_output2 <- data_wf_mtry4 %>%
  update_model(rf_spec %>% set_args(importance = "permutation")) %>%
  fit(data = imdb_clean_class) %>%
  extract_fit_engine()
model_output2 %>%
  vip(num_features = 10) + theme_classic()
model_output2 %>% vip::vi() %>% head()
model_output2 %>% vip::vi() %>% tail()

# Violin plots: how each predictor's distribution differs by flop status.
violin_by_flop <- function(yvar) {
  ggplot(imdb_clean_class, aes(x = flop, y = {{ yvar }})) +
    geom_violin() + theme_classic()
}
violin_by_flop(No_of_Votes)
violin_by_flop(Runtime)
violin_by_flop(IMDB_Rating)
violin_by_flop(Meta_score)
set.seed(123)
# Logistic regression specification (glm engine).
logistic_spec <- logistic_reg() %>%
  set_engine('glm') %>%
  set_mode('classification')

# Recipe: flop modeled from four predictors, no preprocessing steps.
logistic_rec <- recipe(flop ~ No_of_Votes + Runtime + IMDB_Rating + Genre_1,
                       data = imdb_clean_class)

# Workflow (Recipe + Model) for the full logistic model.
log_wf <- workflow() %>%
  add_recipe(logistic_rec) %>%
  add_model(logistic_spec)

# Fit on the full data; report coefficients and odds ratios (exp of the
# log-odds coefficients).
log_fit <- fit(log_wf, data = imdb_clean_class)
tidy(log_fit)
log_fit %>% tidy() %>%
  mutate(OR = exp(estimate))

# 10-fold CV estimates of accuracy, sensitivity and specificity.
data_cv10_class <- vfold_cv(imdb_clean_class, v = 10)
log_modelcv <- fit_resamples(log_wf, resamples = data_cv10_class, metrics = metric_set(accuracy, sens, yardstick::spec))
log_modelcv %>%
  collect_metrics()

# Soft (probability) predictions on the training data.
final_output <- log_fit %>% predict(new_data = imdb_clean_class, type = 'prob') %>% bind_cols(imdb_clean_class)
final_output %>%
  ggplot(aes(x = flop, y = .pred_TRUE)) +
  geom_boxplot()

# ROC curve from the soft predictions; event_level = 'second' treats the
# second factor level (TRUE = flop) as the event.
final_output %>%
  roc_curve(flop, .pred_TRUE, event_level = 'second') %>%
  autoplot()
# Threshold analysis in terms of the reference level (.pred_FALSE).
threshold_output <- final_output %>%
  threshold_perf(truth = flop, estimate = .pred_FALSE, thresholds = seq(0, 1, by = .01))

# J-index vs threshold for predicting "no flop".
threshold_output %>%
  filter(.metric == 'j_index') %>%
  ggplot(aes(x = .threshold, y = .estimate)) +
  geom_line() +
  labs(y = 'J-index', x = 'threshold') +
  theme_classic()
threshold_output %>%
  filter(.metric == 'j_index') %>%
  arrange(desc(.estimate))

# Distance (to the ideal top-left ROC corner) vs threshold; smaller is better.
threshold_output %>%
  filter(.metric == 'distance') %>%
  ggplot(aes(x = .threshold, y = .estimate)) +
  geom_line() +
  labs(y = 'Distance', x = 'threshold') +
  theme_classic()
threshold_output %>%
  filter(.metric == 'distance') %>%
  arrange(.estimate)

# Hard predictions at a 0.12 threshold on .pred_FALSE, then metrics and a
# confusion matrix at that threshold.
log_metrics <- metric_set(accuracy, sens, yardstick::spec)
final_output %>%
  mutate(.pred_class = make_two_class_pred(.pred_FALSE, levels(flop), threshold = .12)) %>%
  log_metrics(truth = flop, estimate = .pred_class, event_level = 'second')
final_output %>%
  mutate(.pred_class = make_two_class_pred(.pred_FALSE, levels(flop), threshold = .12)) %>%
  conf_mat(truth = flop, estimate = .pred_class)
## Truth
## Prediction FALSE TRUE
## FALSE 475 65
## TRUE 0 0

# Probability prediction for a single hypothetical movie.
predict(log_fit, new_data = data.frame(No_of_Votes = 10000, Runtime = 112, IMDB_Rating = 9.8,
                                       Genre_1 = "Drama"), type = "prob"
)
# We performed the hard prediction manually above by supplying the
# predictor values for a single hypothetical movie.
# Bivariate scatterplots used to pick variables for clustering.
pair_scatter <- function(xvar, yvar) {
  ggplot(imdb_clean, aes(x = {{ xvar }}, y = {{ yvar }})) +
    geom_point() + theme_classic()
}
pair_scatter(Budget, Runtime)
pair_scatter(Budget, Gross)
pair_scatter(No_of_Votes, Runtime)
pair_scatter(Gross, No_of_Votes)

# Variables retained for k-means clustering.
imdb_sub <- imdb_clean %>%
  select(Budget, Runtime)
set.seed(253)
# Cluster `data` into k groups with k-means (on standardized columns) and
# return the total within-cluster sum of squares. `data` defaults to the
# global `imdb_sub` so existing calls (imdb_cluster_ss(k)) are unchanged,
# but the function can now be reused on any numeric data set.
imdb_cluster_ss <- function(k, data = imdb_sub){
  kclust <- kmeans(scale(data), centers = k)
  kclust$tot.withinss
}
# Elbow plot: total within-cluster SS for k = 1..15, plotted against k to
# look for the point of diminishing returns.
k_values <- 1:15
elbow_df <- tibble(
  k = k_values,
  tot_wc_ss = purrr::map_dbl(k_values, imdb_cluster_ss)
)
ggplot(elbow_df, aes(x = k, y = tot_wc_ss)) +
  geom_point() +
  labs(x = "Number of clusters", y = "Total within-cluster sum of squares") +
  theme_classic()
# Final k-means fit with k = 8 clusters (chosen from the elbow plot above),
# on the standardized Budget/Runtime variables.
kclust_k8 <- kmeans(scale(imdb_sub), centers = 8)
kclust_k8$cluster # Display cluster assignments
## [1] 1 5 8 2 4 2 5 2 8 6 6 6 5 2 6 1 8 1 6 2 7 1 7 1 1 7 1 6 5 6 1 7 7 4 6 5 7
## [38] 7 7 4 6 8 5 5 8 7 7 5 5 7 4 7 7 1 7 1 6 7 7 7 1 1 4 2 4 7 1 5 5 5 7 1 1 5
## [75] 1 2 1 1 1 7 4 7 7 4 5 2 1 6 8 6 5 5 8 6 5 1 8 5 6 7 1 5 2 1 1 1 1 4 5 1 2
## [112] 1 7 7 1 4 7 1 5 7 6 1 1 1 4 1 5 8 7 6 4 4 8 5 1 1 1 4 2 7 3 6 5 3 3 3 7 4
## [149] 4 2 7 3 1 1 4 1 5 7 4 2 7 1 4 2 1 1 2 1 1 1 1 2 7 5 7 7 1 1 2 7 4 4 1 5 1
## [186] 7 6 8 4 7 8 1 7 7 5 3 5 1 7 7 3 6 3 1 7 8 5 1 1 6 8 3 3 2 2 1 7 1 1 1 4 4
## [223] 7 5 4 2 4 2 1 1 4 1 7 2 1 4 1 1 7 5 1 7 2 7 1 7 7 1 5 2 4 1 5 1 4 7 7 1 4
## [260] 4 5 7 7 4 7 7 7 7 3 7 8 7 8 1 4 7 7 7 1 5 8 7 1 7 6 4 8 4 1 1 6 7 3 3 5 7
## [297] 5 7 7 1 7 4 4 1 4 1 1 1 5 4 7 7 6 2 2 7 4 7 4 7 7 7 7 1 3 1 8 5 8 1 1 1 4
## [334] 8 1 6 6 5 8 5 7 4 4 4 8 1 1 7 6 8 1 7 1 8 2 1 1 1 1 1 4 7 7 6 3 8 1 5 1 7
## [371] 1 1 4 7 7 1 1 3 1 7 4 7 7 5 4 1 7 7 7 1 7 7 7 7 7 7 5 1 2 1 7 4 4 7 7 7 4
## [408] 6 1 7 8 5 7 8 4 4 3 3 8 1 1 7 8 1 3 4 8 6 7 7 1 6 7 8 5 1 3 7 5 7 8 6 3 8
## [445] 7 1 7 6 3 7 7 4 3 4 7 6 6 1 1 4 2 1 4 2 3 1 1 3 5 1 4 4 1 7 4 4 4 4 4 1 7
## [482] 7 7 7 5 7 5 1 1 7 7 7 1 7 1 7 1 8 7 1 1 1 7 4 8 7 7 7 3 7 7 7 1 4 7 6 1 7
## [519] 3 3 6 1 7 3 8 3 1 6 7 7 4 1 7 7 7 7 3 6 6 6 7 4 5 3 7 5 1 6 1 6 7 7 1 7 5
## [556] 4 4 4 7 7 4 4 4 7 4 1 1 5 4 7 2 7
# Attach the cluster assignment to the data as a factor column.
imdb_clean <- imdb_clean %>%
mutate(kclust_8 = factor(kclust_k8$cluster))
# Visualize the cluster assignments on the original scatterplot
imdb_clean %>%
ggplot(aes(x = Budget, y = Runtime, color = kclust_8)) +
geom_point() + theme_classic()
# Movie counts per genre: primary, secondary, and overall genre columns.
imdb_clean %>%
  count(Genre_1)
imdb_clean %>%
  count(Genre_2)
imdb_clean %>%
  count(New_Genre)

# Genre composition within each k-means cluster.
imdb_clean %>%
  group_by(kclust_8) %>%
  count(Genre_1)
imdb_clean %>%
  group_by(kclust_8) %>%
  count(Genre_2)

# Number of movies assigned to each cluster.
imdb_clean %>%
  count(kclust_8)

# Stacked proportion bar charts: genre mix per cluster for a chosen genre
# column (position = "fill" normalizes each bar to proportions).
cluster_genre_plot <- function(fill_var) {
  ggplot(imdb_clean, aes(x = kclust_8, fill = {{ fill_var }})) +
    geom_bar(position = "fill") +
    labs(x = "Cluster") +
    theme_classic()
}
cluster_genre_plot(Genre_1)
cluster_genre_plot(Genre_2)
cluster_genre_plot(New_Genre)
# Random subsample of 25 movies so the dendrogram labels stay readable.
set.seed(253)
imdb_hc <- imdb_clean %>%
  slice_sample(n = 25)

# Clustering variables: log profit (Gross) and Budget, for both the
# subsample and the full data set.
imdb_hc_sub <- imdb_hc %>%
  select(Gross, Budget)
imdb_hc_full <- imdb_clean %>%
  select(Gross, Budget)

# Summary statistics for the subsample's clustering variables.
summary(imdb_hc_sub)
## Gross Budget
## Min. :13.82 Min. : 0
## 1st Qu.:15.98 1st Qu.: 3000000
## Median :17.54 Median : 18500000
## Mean :17.34 Mean : 39318800
## 3rd Qu.:18.88 3rd Qu.: 60000000
## Max. :20.31 Max. :190000000
# Euclidean distance matrices on the standardized variables.
dist_mat_scaled <- dist(scale(imdb_hc_sub)) # subsample
dist_mat_full <- dist(scale(imdb_hc_full))  # full data

# Average-linkage hierarchical clustering.
imdb_hc_avg <- hclust(dist_mat_scaled, method = "average")
imdb_full_avg <- hclust(dist_mat_full, method = "average")

# Dendrogram of the subsample, with increasingly detailed genre labels.
plot(imdb_hc_avg)
plot(imdb_hc_avg, labels = imdb_hc$Genre_1)
plot(imdb_hc_avg, labels = paste(imdb_hc$Genre_1, imdb_hc$Genre_2))
plot(imdb_hc_avg, labels = paste(imdb_hc$Genre_1, imdb_hc$Genre_2, imdb_hc$Genre_3))
plot(imdb_hc_avg, labels = paste(imdb_hc$New_Genre))
# Cut the full-data dendrogram into clusters and attach the assignment.
imdb_clean <- imdb_clean %>%
mutate(
hclust_num = factor(cutree(imdb_full_avg, k = 3)) # Cut into 3 clusters (k = 3; the original comment incorrectly said 6)
)
# Genre composition (primary and overall) within each hierarchical cluster.
ggplot(imdb_clean, aes(x = hclust_num, fill = Genre_1)) +
geom_bar(position = "fill") +
labs(x = "Cluster") +
theme_classic()
ggplot(imdb_clean, aes(x = hclust_num, fill = New_Genre)) +
geom_bar(position = "fill") +
labs(x = "Cluster") +
theme_classic()
# Cluster assignments shown on the Budget vs Gross scatterplot.
ggplot(imdb_clean, aes(x = Budget, y = Gross, color = hclust_num)) +
geom_point() +
theme_classic()